Test CellMixS metrics on Splatter synthetic data
Introduction
We are interested in evaluating integration of scRNA-seq datasets of heterogeneous cell type composition and batch size, which are typical in tumor samples. Here we will test CellMixS batch effect metrics on synthetic datasets generated with splatter We will simulate datasets that can contain one or two of two possible cell types, with different levels of batch effects and relative batch sizes
R environment
Get and load some useful packages
renv::restore()
if (!require("remotes", quietly = TRUE))
install.packages("remotes")
library(remotes)
if (!require("tidyr", quietly = TRUE))
install.packages("tidyr")
if (!require("scIntegrationMetrics", quietly = TRUE))
install_github("carmonalab/scIntegrationMetrics") #calculates LISI and Silhouette
if (!requireNamespace("BiocManager"))
install.packages("BiocManager")
if (!requireNamespace("CellMixS"))
BiocManager::install("CellMixS")
if (!requireNamespace("CellMixS"))
BiocManager::install("splatter")library(dplyr)
library(ggplot2)
library(tidyr)
library(scIntegrationMetrics)
library(patchwork)
library(CellMixS)
library(splatter)
library(Seurat)
library(BiocParallel)
seed = 1234
set.seed(seed)# Load required packages
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(dplyr)
library(purrr)
library(ggplot2)
library(scater)
})Prepare extreme heterogeneous datasets (batch effects + different subtype composition) to evaluate metrics
# The next function will generate synthetic datasets, in two possible modes:
# mode twoGroups=T
# will generate 3 batches including two cell types/groups
# Batch 1: contains cell types A and B with equal proportions
# Batch 2: contains only cell type A (cells Group 1)
# Batch 3: contains only cell type B (cells Group 2)
# Batch 1 contains a frequency of `freqBatchAB` of total cells; and the rest are equally devided between Batches 2 and 3.
#
# mode twoGroups=F
# will generate 2 baches with 1 cell type
# then will evaluate a number of metrics from the CellMixS Bioconductor package
simulateAndMeasure <- function(ncellsTotal, freqBatchAB, freqBatchA=NULL, twoGroups=T, de.facLoc=0.2, de.facScale=0.2, batch.facScale=0, batch.facLoc=0, mySeed=123, k=20, cell_min=10, batch_min=NULL, ncores=8){
if(is.null(freqBatchA)) {
freqBatchA <- (1-freqBatchAB)/2
}
freqBatchB <- 1-freqBatchAB-freqBatchA
if (twoGroups){
group.prob <- 0.5 # cell type A frequency; avoid group.prob close to 0 or 1
batchCells <- round(c(freqBatchAB,freqBatchA/group.prob,freqBatchB/(1-group.prob))*ncellsTotal)
test <- splatSimulate(batchCells = batchCells, group.prob = c(group.prob, 1-group.prob), method = "groups", verbose = FALSE, de.facLoc=de.facLoc, de.facScale=de.facScale, batch.facScale= batch.facScale, batch.facLoc=batch.facLoc ) # see ?SplatParams de.facLoc=0.2, de.facScale=0.2
test <- test[,!(test$Batch=="Batch2" & test$Group=="Group1")]
test <- test[,!(test$Batch=="Batch3" & test$Group=="Group2")]
} else {
batchCells <- round(c(freqBatchAB,1-freqBatchAB)*ncellsTotal)
test <- splatSimulate(batchCells = batchCells, verbose = FALSE, de.facLoc=de.facLoc, de.facScale=de.facScale, batch.facScale= batch.facScale, batch.facLoc=batch.facLoc )
test$Group <- "Group1"
}
set.seed(mySeed)
print(table(test$Group,test$Batch))
test <- logNormCounts(test)
test <- runPCA(test, ncomponents = 2, ntop=500)
#test <- evalIntegration(test, metrics = c("isi", "cms","mixingMetric","entropy"), group = "Batch", k = k, n_dim = 2, cell_min = cell_min, weight = F, BPPARAM=MulticoreParam(workers=ncores))
test <- evalIntegration(test, metrics = c("isi", "cms"), group = "Batch", k = k, n_dim = 2, cell_min = cell_min, weight = F, batch_min = batch_min, BPPARAM=MulticoreParam(workers=ncores))
if(twoGroups){
test <- evalIntegration(test, metrics = c("isi", "cms"), group = "Group", k = k, n_dim = 2, cell_min = cell_min, weight = F, batch_min = batch_min, BPPARAM=MulticoreParam(workers=ncores), res_name = c("cluster_lisi", "cluster_cms"))
}
return(test)
}# plot pca and metrics
plotSimul <- function(test){
plotPCA(test, shape_by = "Group", colour_by = "Batch") + scale_color_discrete(labels = paste(levels(test$Batch),":",table(test$Batch))) + coord_fixed() + labs(subtitle = sprintf("cms %.2f; lisi %.2f; cluster_cms %.2f; cluster_lisi %.2f",mean(test$cms_smooth),mean(test$isi),mean(test$cms_smooth.cluster_cms),mean(test$cluster_lisi)),
) | plotPCA(test, shape_by = "Batch", colour_by = "Group") + coord_fixed()
}Set parameters
k = 30 # nr. neighbors for lisi and cms
cell_min = 4 # cms: Minimum number of cells from each group to be included into the AD test
batch_min = 5 # cms: Minimum number of cells per batch to include in to the AD test. If set, neighbors will be included until batch_min cells from each batch are present.
ncores = 4
ncellsTotal = 1000
freqBatchAB = 0.5
test.list.single <- list()In the initial configuration (k=20 and default cms parameters), mean cms and lisi peak at equal batch size. Both metrics are strongly affected by batch size unbalance. The larger the unbalance, the lower cms/lisi. Here we try if other cms configurations make it robust to unbalance (using batch_min and increased k) while preserving other desirable properties
Batch effect magnitude
First, let’s evaluate how the different mixing metrics detect increasing levels of simulated batch effect:
for (batch.factor in seq(0,0.12,by=0.02)) {
name <- paste0("batch0_single_batchFactor_",batch.factor)
test.list.single[[name]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = batch.factor, batch.facLoc = batch.factor, k=k, cell_min=cell_min, ncores=ncores, batch_min=batch_min, twoGroups = F)
}##
## Batch1 Batch2
## Group1 500 500
##
## Batch1 Batch2
## Group1 500 500
##
## Batch1 Batch2
## Group1 500 500
##
## Batch1 Batch2
## Group1 500 500
##
## Batch1 Batch2
## Group1 500 500
##
## Batch1 Batch2
## Group1 500 500
##
## Batch1 Batch2
## Group1 500 500
pp <- lapply(names(test.list.single), function(name) {
plotPCA(test.list.single[[name]], shape_by = "Group", colour_by = "Batch") + scale_color_discrete(labels = paste(levels(test.list.single[[name]]$Batch),":",table(test.list.single[[name]]$Batch))) + coord_fixed() + labs(subtitle = sprintf("cms %.2f; lisi %.2f",mean(test.list.single[[name]]$cms_smooth),mean(test.list.single[[name]]$isi))
) & ggtitle(name) & theme(aspect.ratio=1) | visHist(test.list.single[[name]],metric = c("cms"))
})
pp## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
pp.out <- wrap_plots(pp, ncol = 2)
ggsave("testMetricsSynthethicData_SingleCellType_BatchFactor.png",pp.out,width = 10,height = 10)test.list.single.metrics <- data.frame(batchFactor = sub("batch0_single_batchFactor_","",names(test.list.single)),
cms=sapply(test.list.single, function(x) mean(x$cms_smooth)),
lisi= sapply(test.list.single, function(x) mean(x$isi)))
test.list.single.metrics %>% pivot_longer(cols=c("cms","lisi")) %>% ggplot(aes(x=batchFactor, y=value, group=name)) + geom_line() + facet_wrap(~ name, scales = "free")Metrics cms and lisi behave as expected, decreasing with batch effects between the two simulated batches. Mean cms ranges from 0.5 for perfect mixing to 0, no mixing; lisi goes from almost 2 for perfect mixing to 1 for no mixing.
Differential batches abundances
Next, let’s evaluate how the metrics are affected by differential batches abundances
Parameters
ncellsTotal = 1000
test.list.single.balance <- list()for (myfreqBatchAB in seq(0.2,0.8,by=0.1)) {
name <- paste0("batch0_single_balance_",myfreqBatchAB)
test.list.single.balance[[name]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = myfreqBatchAB, batch.facScale = 0, batch.facLoc = 0, k=k, cell_min=cell_min, ncores=ncores, batch_min=batch_min, twoGroups = F)
}##
## Batch1 Batch2
## Group1 200 800
##
## Batch1 Batch2
## Group1 300 700
##
## Batch1 Batch2
## Group1 400 600
##
## Batch1 Batch2
## Group1 500 500
##
## Batch1 Batch2
## Group1 600 400
##
## Batch1 Batch2
## Group1 700 300
##
## Batch1 Batch2
## Group1 800 200
We get the warning that “There are less than ‘batch_min’ cells of each batch in a reasonable sized neighbourhood”
This is expected with
pp <- lapply(names(test.list.single.balance), function(name) {
plotPCA(test.list.single.balance[[name]], shape_by = "Group", colour_by = "Batch") + scale_color_discrete(labels = paste(levels(test.list.single.balance[[name]]$Batch),":",table(test.list.single.balance[[name]]$Batch))) + coord_fixed() + labs(subtitle = sprintf("cms %.2f; lisi %.2f",mean(test.list.single.balance[[name]]$cms_smooth),mean(test.list.single.balance[[name]]$isi))
) & ggtitle(name) & theme(aspect.ratio=1) | visHist(test.list.single.balance[[name]],metric = c("cms"))
})## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
pp## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
pp.out <- wrap_plots(pp, ncol = 2)
ggsave("testMetricsSynthethicData_SingleCellType_BatchBalance.png",pp.out,width = 10,height = 10)test.list.single.balance.metrics.balance <- data.frame(balance = sub("batch0_single_balance_","",names(test.list.single.balance)),
cms=sapply(test.list.single.balance, function(x) mean(x$cms_smooth)),
lisi= sapply(test.list.single.balance, function(x) mean(x$isi)))
test.list.single.balance.metrics.balance %>% pivot_longer(cols=c("cms","lisi")) %>% ggplot(aes(x=balance, y=value, group=name)) + geom_line() + facet_wrap(~ name, scales = "free")The pattern is the same for cms and lisi as for the default configuration. However, using batch_min (increasing neighborhood when needed) mitigates the relative drop of cms compared to max (50/50 balance) cms, from ~0.1 (default setting) to ~0.38 (batch_min) in the 20/80 batches balance setting. The relative impact is milder for cms (~25%) than for lisi (~50%)
Batches of heterogeneous cell type composition
Now let’s see how the mixing metrics behave when the batches contain different cell types that should not mix, with different levels of batch effect
Parameters
ncellsTotal = 1000
freqBatchAB = 1/3
batch.factor.strong = 0.1
batch.factor.small = 0.06
test.list <- list()test.list[["batchStrong"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = batch.factor.strong, batch.facLoc = batch.factor.strong, k=k, cell_min=cell_min, ncores=ncores, batch_min=batch_min)##
## Batch1 Batch2 Batch3
## Group1 177 0 328
## Group2 156 335 0
plotSimul(test.list[["batchStrong"]])## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
Here the large effects are large, and cells are similarly separately by
batch and cell type. Because batches do not overlap, lisi is ~1 and cms
is ~0. Similarly, cells of one type do not mix with cells of another
type, and thus ‘cluster/celltype lisi’ or ‘cluster/celltype cms’ are
also ~1 and ~0, respectively.
test.list[["batchMild"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = batch.factor.small, batch.facLoc = batch.factor.small, k=k, cell_min=cell_min, ncores=ncores, batch_min=batch_min)##
## Batch1 Batch2 Batch3
## Group1 171 0 333
## Group2 162 317 0
plotSimul(test.list[["batchMild"]])## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
Here with mild batch effects, cells are largely grouped by cell type,
and cells of different batches are more mixed. In this situation, both
cms and lisi increase. Cells of different type do not mix with each
other, so cluster_cms and cluster_lisi remain at minimum levels.
test.list[["batch0"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = 0, batch.facLoc = 0, k=k, cell_min=cell_min, ncores=ncores, batch_min=batch_min)##
## Batch1 Batch2 Batch3
## Group1 155 0 318
## Group2 178 315 0
plotSimul(test.list[["batch0"]])## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
Without batch effects, cells of the same type from different batches are
perfectly mixed. In this situation, cms and lisi further increase.
In this big neighborhood configuration, it seems though that cms loses sensitivity to detect mild batch effect (increased from 0.43 with mild batch effects to 0.45 with no batch effects)
Cells of different type do not mix with each other, so cluster_cms and cluster_lisi remain at minimum levels
test.list[["batch0_unbalanced"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB/2, batch.facScale = 0, batch.facLoc = 0, k=k, cell_min=cell_min, ncores=ncores, batch_min=batch_min)
plotSimul(test.list[["batch0_unbalanced"]])test.list[["overcorrected"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = 0, batch.facLoc = 0, de.facLoc = 0, de.facScale = 0, k=k, cell_min=cell_min, ncores=ncores, batch_min=batch_min)##
## Batch1 Batch2 Batch3
## Group1 168 0 337
## Group2 165 328 0
plotSimul(test.list[["overcorrected"]])## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
If we applied an integration method that ‘overcorrects’ (ie that removes batch effect very strongly at the expense of also removing biological signal), all cells from different batches are perfectly mixed, irrespective of the cell type .
In this situation, cms and lisi further increase, indicating better batch mixing, irrespective of the non preservation of biological variation.
Now, because cells of different type mix with each other, cluster_cms and cluster_lisi strongly increase
This example illustrates that a higher cms/lisi is not necessarily indicating a better integration. In this case, increase of cms/lisi is associated with overcorrection and loss of cell type -specific signals
plot.list <- lapply(names(test.list),function(name){
plotSimul(test.list[[name]]) & ggtitle(name) & theme(aspect.ratio=1) | visHist(test.list[[name]],metric = c("cms"))
#plotPCA(test, shape_by = "Group", colour_by = "Batch") + ggtitle(sprintf("cms %.2f; lisi %.2f",mean(test$cms_smooth),mean(test$isi))) | visHist(test,metric = c("cms"))
})
wrap_plots(plot.list,ncol = 2)ggsave("testMetricsSynthethicData.png", width = 20, height = 10)lapply(test.list,function(test){
visOverview(test, "Batch", metric = c("cms", "isi"), other_var = "Group")
})Export datasets
ndim=2
exportLists <- list(batchEffect=test.list.single,batchBalance=test.list.single.balance,batchCellTypes=test.list)
for (task in names(exportLists)){
i <- exportLists[[task]]
seurat.list <- lapply(names(i), function(name){
sce <- i[[name]]
obj <- as.Seurat(sce, counts = "counts", data = "logcounts")
obj <- RenameAssays(obj, originalexp = 'RNA')
#obj <- NormalizeData(obj) %>% ScaleData() %>% RunPCA(npcs=ndim)
obj
})
names(seurat.list) <- names(i)
saveRDS(seurat.list,paste0(task,".synthetic.seurat.rds"))
}